Data Preparation
#For forecasting, we chose the latest data
trend_hosp_hb <- trend_hb_daily %>%
filter (hb_name == "Scotland") %>%
filter(date >="2021-06-01") %>%
filter(!(is.na(hospital_admissions))) %>%
select(date, hospital_admissions)
# Convert it into a time series
daily_hosp_hb_zoo <- zoo(trend_hosp_hb$hospital_admissions,
order.by=as.Date(trend_hosp_hb$date, format='%m/%d/%Y'))
# Convert it into a time series
daily_hosp_hb_timeseries <- timeSeries::as.timeSeries(daily_hosp_hb_zoo)
ARIMA MODEL
Step 1 : Visualize the time series
p<-autoplot(daily_hosp_hb_timeseries, ts.colour = '#5ab4ac')+
xlab("Month") +
ylab("Number of People hospitalised")+
#scale_x_date(breaks = "1 month", date_labels = "%b - %y" )+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
ggtitle("Trend on Hospitalisation") +
color_theme()
ggplotly(p)
Step 2 : Identification of model : (Finding d:)
Identify whether the time series is stationary / non stationary we can use ADF Augmented Dickey-Fuller test
adf_test_hosp <- adf.test(daily_hosp_hb_timeseries)
adf_test_hosp
Augmented Dickey-Fuller Test
data: daily_hosp_hb_timeseries
Dickey-Fuller = -1.1773, Lag order = 4, p-value = 0.907
alternative hypothesis: stationary
The time series is not stationary since we have a high p-value (p-value must be < 0.05). So we apply difference
first_diff_hosp<- diff(daily_hosp_hb_timeseries)
adf_test1_hosp <- adf.test(na.omit(first_diff_hosp))
Warning in adf.test(na.omit(first_diff_hosp)) :
p-value smaller than printed p-value
adf_test1_hosp
Augmented Dickey-Fuller Test
data: na.omit(first_diff_hosp)
Dickey-Fuller = -7.8429, Lag order = 4, p-value = 0.01
alternative hypothesis: stationary
Create a dataframe to compare
adf_data_hosp <- data.frame(Data = c("Original", "First-Ordered"),
Dickey_Fuller = c(adf_test_hosp$statistic, adf_test1_hosp$statistic),
p_value = c(adf_test_hosp$p.value,adf_test1_hosp$p.value))
adf_data_hosp
Initially the p-value is high which indicates that the Time Series is not stationary. So we apply difference 1 time. After the first difference, the p-value < significance level (0.05) So we can conclude that the difference data are stationary. So difference (d = 1)
Other method:
ndiffs(daily_hosp_hb_timeseries)
[1] 1
Let’s plot the First Order Difference Series
Order of first difference
first_diff_ts<- diff(daily_hosp_hb_timeseries)
p<- autoplot(first_diff_ts, ts.colour = '#5ab4ac') +
xlab("Month") +
ylab("HOSPITALIZATION")+
# scale_x_date(breaks = "1 month", date_labels = "%b - %y" )+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
ggtitle("First-Order Difference Series") +
color_theme()
ggplotly(p)
For our model ARIMA (p,d,q), we found d = 1, the next step is to get the values of p and q, the order of AR and MA part. Plot ACF and PACF charts to identify q and p respectively.
[1] -0.20 -0.15 -0.05 -0.04 -0.12 0.13 0.31 0.07 -0.12 -0.08 -0.12 -0.09 0.15 0.31 -0.05 -0.15 -0.07 -0.08 -0.14
[20] 0.19 0.12 0.08
[1] -0.20 -0.20 -0.13 -0.13 -0.22 0.00 0.31 0.32 0.18 0.09 -0.06 -0.19 -0.14 0.11 0.02 -0.09 -0.08 -0.03 -0.15
[20] -0.05 -0.14 0.04
The ACF and PACF plots of the differenced data show the following patterns:
The ACF is sinusoidal and there is a significant spike at lag 3 in the PACF, but none beyond lag 3. So the data may follow an AR(3) model
The PACF is sinusoidal and there is a significant spike at lag 2 in the ACF, but none beyond lag 2 So the data may follow an MA(2) model
So we propose three ARMA models for the differenced data: ARMA(p,q) ARMA(3,2), ARMA(3,0) and ARMA(0,2).
That is, for the original time series, we propose three ARIMA models,ARIMA(p,d,q) ARIMA(3,1,2), ARIMA(3,1,0) and ARMA(3,1,2).
Manual Model:
arima_fit1 = Arima(daily_hosp_hb_timeseries, order = c(3,1,2))
arima_fit2 = Arima(daily_hosp_hb_timeseries, order = c(3,1,0))
arima_fit3 = Arima(daily_hosp_hb_timeseries, order = c(3,1,2))
arima_fit4 = Arima(daily_hosp_hb_timeseries, order = c(3,1,1))
summary(arima_fit1)
Series: daily_hosp_hb_timeseries
ARIMA(3,1,2)
Coefficients:
ar1 ar2 ar3 ma1 ma2
0.9278 -0.5751 -0.2510 -1.3813 0.8995
s.e. 0.1167 0.1159 0.1318 0.1148 0.0459
sigma^2 estimated as 216.7: log likelihood=-512.03
AIC=1036.06 AICc=1036.77 BIC=1053.03
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.4996576 14.36433 11.29031 -2.312697 17.82414 0.8440723 -0.03117281
summary(arima_fit2)
Series: daily_hosp_hb_timeseries
ARIMA(3,1,0)
Coefficients:
ar1 ar2 ar3
-0.2680 -0.2282 -0.1314
s.e. 0.0891 0.0894 0.0886
sigma^2 estimated as 271.1: log likelihood=-526.09
AIC=1060.18 AICc=1060.52 BIC=1071.5
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.5172473 16.2017 12.65275 -2.83438 19.60977 0.9459289 -0.01927192
summary(arima_fit3)
Series: daily_hosp_hb_timeseries
ARIMA(3,1,2)
Coefficients:
ar1 ar2 ar3 ma1 ma2
0.9278 -0.5751 -0.2510 -1.3813 0.8995
s.e. 0.1167 0.1159 0.1318 0.1148 0.0459
sigma^2 estimated as 216.7: log likelihood=-512.03
AIC=1036.06 AICc=1036.77 BIC=1053.03
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.4996576 14.36433 11.29031 -2.312697 17.82414 0.8440723 -0.03117281
summary(arima_fit4)
Series: daily_hosp_hb_timeseries
ARIMA(3,1,1)
Coefficients:
ar1 ar2 ar3 ma1
0.1675 -0.1301 -0.0808 -0.4598
s.e. 0.1941 0.1001 0.1021 0.1745
sigma^2 estimated as 268.1: log likelihood=-524.91
AIC=1059.81 AICc=1060.32 BIC=1073.96
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.6034759 16.04472 12.61836 -2.754632 19.47254 0.9433583 -0.009659787
Forecast the Manual ARIMA model
# Forecast the manual models
future = forecast(arima_fit1, h = 30)
future2 = forecast(arima_fit2, h = 30)
future3 = forecast(arima_fit3, h = 30)
future4 = forecast(arima_fit4, h = 30)
#Plot the forecasted manual models
par(mfrow = c(2,2))
plot(future)
plot(future2)
plot(future3)
plot(future4)
(Automated ARIMA)
auto_arima_fit_hosp <- auto.arima(daily_hosp_hb_timeseries,
seasonal=FALSE,
stepwise=FALSE,
approximation=FALSE,
trace = TRUE
)
ARIMA(0,1,0) : 1066.532
ARIMA(0,1,0) with drift : 1068.54
ARIMA(0,1,1) : 1059.624
ARIMA(0,1,1) with drift : 1061.597
ARIMA(0,1,2) : 1057.053
ARIMA(0,1,2) with drift : 1058.998
ARIMA(0,1,3) : 1059.18
ARIMA(0,1,3) with drift : 1061.159
ARIMA(0,1,4) : Inf
ARIMA(0,1,4) with drift : Inf
ARIMA(0,1,5) : Inf
ARIMA(0,1,5) with drift : Inf
ARIMA(1,1,0) : 1063.419
ARIMA(1,1,0) with drift : 1065.439
ARIMA(1,1,1) : 1058
ARIMA(1,1,1) with drift : 1059.948
ARIMA(1,1,2) : 1059.184
ARIMA(1,1,2) with drift : 1061.163
ARIMA(1,1,3) : 1060.633
ARIMA(1,1,3) with drift : 1062.644
ARIMA(1,1,4) : Inf
ARIMA(1,1,4) with drift : Inf
ARIMA(2,1,0) : 1060.559
ARIMA(2,1,0) with drift : 1062.583
ARIMA(2,1,1) : 1058.73
ARIMA(2,1,1) with drift : 1060.708
ARIMA(2,1,2) : Inf
ARIMA(2,1,2) with drift : Inf
ARIMA(2,1,3) : 1063.1
ARIMA(2,1,3) with drift : 1065.154
ARIMA(3,1,0) : 1060.516
ARIMA(3,1,0) with drift : 1062.545
ARIMA(3,1,1) : 1060.319
ARIMA(3,1,1) with drift : 1062.325
ARIMA(3,1,2) : 1036.774
ARIMA(3,1,2) with drift : 1038.835
ARIMA(4,1,0) : 1060.598
ARIMA(4,1,0) with drift : 1062.636
ARIMA(4,1,1) : 1061.11
ARIMA(4,1,1) with drift : 1063.136
ARIMA(5,1,0) : 1056.561
ARIMA(5,1,0) with drift : 1058.52
Best model: ARIMA(3,1,2)
summary(auto_arima_fit_hosp)
Series: daily_hosp_hb_timeseries
ARIMA(3,1,2)
Coefficients:
ar1 ar2 ar3 ma1 ma2
0.9278 -0.5751 -0.2510 -1.3813 0.8995
s.e. 0.1167 0.1159 0.1318 0.1148 0.0459
sigma^2 estimated as 216.7: log likelihood=-512.03
AIC=1036.06 AICc=1036.77 BIC=1053.03
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.4996576 14.36433 11.29031 -2.312697 17.82414 0.8440723 -0.03117281
Automated ARIMA confirms that the ARIMA(3,1,2) seems good based on AIC
lmtest::coeftest(auto_arima_fit_hosp)
z test of coefficients:
Estimate Std. Error z value Pr(>|z|)
ar1 0.927809 0.116709 7.9498 1.869e-15 ***
ar2 -0.575083 0.115853 -4.9639 6.910e-07 ***
ar3 -0.250958 0.131753 -1.9048 0.05681 .
ma1 -1.381267 0.114776 -12.0345 < 2.2e-16 ***
ma2 0.899495 0.045893 19.5999 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
All coefficients are significant except ar3.
Model Selection Criteria :
ARIMA models with minimum AIC, RMSE and MAPE criteria were chosen as the best models. Based on Akaike Information Criterion (AIC) above, an ARIMA(3, 1, 2) model seems best.
Let’s plot the diagnostics with the results to make sure the normality and correlation assumptions for the model hold. If the residuals look like white noise, proceed with forecast and prediction, otherwise repeat the model building.
checkresiduals(auto_arima_fit_hosp, theme = color_theme())
Ljung-Box test
data: Residuals from ARIMA(3,1,2)
Q* = 21.843, df = 5, p-value = 0.0005609
Model df: 5. Total lags used: 10
The ACF plot of the residuals from the ARIMA(3,1,2) model shows that all auto correlations are almost within the threshold limits, with 2 outliers. A portmanteau test (Ljung-Box test) returns a smaller p-value, also suggesting that the residuals are white noise.
Fitting the ARIMA model with the existing data
The residual errors seem fine with near zero mean and uniform variance. Let’s plot the actuals against the fitted values
daily_hosp_hb_timeseries_data <- fortify(daily_hosp_hb_timeseries) %>%
clean_names() %>%
remove_rownames %>%
rename (date = index,
hosp = data)%>%
mutate(index = seq(1:nrow(daily_hosp_hb_timeseries)))
arima_fit_resid <- ts(daily_hosp_hb_timeseries) - resid(auto_arima_fit_hosp)
arima_fit_data <- fortify(arima_fit_resid) %>%
clean_names() %>%
mutate(data = round(data,2))
fit_existing_data <- daily_hosp_hb_timeseries_data %>%
inner_join(arima_fit_data, by = c("index"))
#plotting the series along with the fitted values
fit_existing_data %>%
ggplot()+
aes(x=date, y = hosp)+
geom_line(color ="#5ab4ac")+
geom_line(aes(x= date, y = data), colour = "red" )+
xlab("Month") +
ylab("Number of People vaccinated")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
ggtitle("Fitting the ARIMA model with existing data") +
#scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6))+
color_theme()
6 Forecast using the model
Data Preparation :
forecast_model <- forecast(auto_arima_fit_hosp,level = c(80, 95), h = 60)
#Convert the model to dataframe for plotting
forecast_model_data <- fortify(forecast_model) %>%
clean_names() %>%
mutate(data = round(data,2),
fitted= round(fitted,2))
forecast_start_date <- as.Date(max(daily_hosp_hb_timeseries_data$date)+1)
forecast_end_date <- as.Date(forecast_start_date+59)
forecast_data <- forecast_model_data %>%
filter(!(is.na(point_forecast))) %>%
mutate(date = seq(forecast_start_date,forecast_end_date, by =1)) %>%
select(-data,-fitted, -index)
fitted_data <- forecast_model_data %>%
filter(!(is.na(data))) %>%
inner_join(daily_hosp_hb_timeseries_data, by = c("index")) %>%
mutate(date = as.Date(date)) %>%
select(date, data, fitted)
#Plotting the Vaccination series plus the forecast and 80 - 95% prediction intervals
annotation <- data.frame(
x = c(as.Date("13-06-2021","%d-%m-%Y"),as.Date("31-10-2021","%d-%m-%Y")),
y = c(100,200),
label = c("PAST", "FUTURE")
)
#Time series plots for the next 60 days according to best ARIMA models with 80%–95% CI.
fitted_data %>%
ggplot()+
geom_line(aes(x= date, y = data), color = "#5ab4ac")+
geom_line(aes(x= date, y = fitted), colour = "red" )+
geom_line(aes(x= date, y =point_forecast), color ="blue", data = forecast_data )+
geom_ribbon(aes(x = date, y = point_forecast, ymin = lo_80, ymax = hi_80),
data = forecast_data, alpha = 0.3, fill = "green")+
geom_ribbon(aes(x = date, y = point_forecast, ymin = lo_95, ymax = hi_95),
data = forecast_data, alpha = 0.1)+
#geom_forecast(aes(x= date, y =point_forecast), data = forecast_data )+
ggtitle("Forecast") +
xlab("Month") +
ylab("Patient Hospitalised")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
color_theme()+
scale_x_date(breaks = "1 month", date_labels = "%b - %y" )+
geom_text(data=annotation,
aes( x=x, y=y, label=label),
color="red",
size=4 )+
geom_vline(xintercept =as.Date("03-10-2021","%d-%m-%Y"), linetype = 2)